package com.tensquare.usercrawler.processor;

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor that walks CSDN blog article pages and extracts the
 * author's nickname and avatar image URL from each one.
 *
 * <p>For every processed page the article links found on it are queued as new
 * target requests. When both the nickname and the avatar URL are present they
 * are stored as the result fields {@code nickname} and {@code image};
 * otherwise the page is flagged with {@code setSkip(true)}.
 */
@Component
public class UserProcessor implements PageProcessor {

    /**
     * Article URLs to follow.
     *
     * <p>Dots in the host name are escaped so they only match a literal dot.
     * The user-id segment accepts letters, digits, {@code -} and {@code _}
     * (ids such as {@code weixin_12345678} contain an underscore) and no
     * longer accepts spaces. The article id accepts any number of digits so
     * shorter ids are not dropped and longer ids are not truncated by the
     * unanchored match.
     */
    private static final String ARTICLE_URL_REGEX =
            "https://blog\\.csdn\\.net/[a-zA-Z0-9_-]+/article/details/[0-9]+";

    /** Selects the text of the nickname span instead of the span's markup. */
    private static final String NICKNAME_XPATH = "//*[@id=\"uid\"]/span/text()";

    /** Selects the avatar {@code <img>} element inside the author profile box. */
    private static final String AVATAR_XPATH = "//*[@id=\"asideProfile\"]/div[1]/div[1]/a/img";

    /** Number of download retries per request. */
    private static final int RETRY_TIMES = 3;

    /** Pause between two page fetches, in milliseconds. */
    private static final int SLEEP_TIME_MS = 100;

    /** Crawl settings, built once and returned by every {@link #getSite()} call. */
    private final Site site = Site.me().setRetryTimes(RETRY_TIMES).setSleepTime(SLEEP_TIME_MS);

    /**
     * Queues the article links found on the page and extracts the author's
     * nickname and avatar URL.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Follow every article link discovered on this page.
        page.addTargetRequests(page.getHtml().links().regex(ARTICLE_URL_REGEX).all());

        // Nickname and avatar of the article's author.
        String nickname = page.getHtml().xpath(NICKNAME_XPATH).get();
        String image = page.getHtml().xpath(AVATAR_XPATH).css("img", "src").get();

        if (hasText(nickname) && hasText(image)) {
            page.putField("nickname", nickname.trim());
            page.putField("image", image.trim());
        } else {
            // Not a page with a complete author profile: emit no result for it.
            page.setSkip(true);
        }
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /** Returns {@code true} when the value is neither {@code null} nor blank. */
    private static boolean hasText(String value) {
        return value != null && !value.trim().isEmpty();
    }
}
